{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import cot\n",
    "from cot import Collection\n",
    "from cot.evaluate import compare_evaluation_difference\n",
    "from cot.stats import evaluation_as_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the bundled 33-sample paper collection relative to the installed `cot`\n",
    "# package rather than through a machine-specific absolute path, so the notebook\n",
    "# also runs on other checkouts.\n",
    "DATASET_PATH = Path(cot.__file__).parent / \"datasets\" / \"thoughtsource\" / \"thoughtsource_33_paper.json\"\n",
    "\n",
    "# The path is handed over as a plain string, exactly as the original call did.\n",
    "coll = Collection.from_json(str(DATASET_PATH))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll.evaluate(overwrite=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll = Collection.from_json(\"eval_bool_3\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "| Name           | Train   | Valid   | Test   |\n",
       "|----------------|---------|---------|--------|\n",
       "| commonsense_qa | -       | 33      | -      |\n",
       "| med_qa         | -       | -       | 33     |\n",
       "| medmc_qa       | -       | 33      | -      |\n",
       "| open_book_qa   | -       | -       | 33     |\n",
       "| strategy_qa    | 33      | -       | -      |\n",
       "| worldtree      | -       | -       | 33     |\n",
       "\n",
       "Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'pubmed_qa', 'qed', 'svamp']"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coll"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found difference in collection old/new evaluation overwrite. Files for comparison are created: compare_evaluation_20230426-144030_old.json and compare_evaluation_20230426-144030_new.json\n"
     ]
    }
   ],
   "source": [
    "compare_evaluation_difference(coll)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "89c5920d65b643339a9e77f18808c690",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kon/work/ThoughtSource/libs/cot/cot/stats.py:406: PerformanceWarning: indexing past lexsort depth may impact performance.\n",
      "  df.loc[dataset, (instruction + \"_\" + cot_trigger, model)] = v\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_5378c_row0_col17, #T_5378c_row1_col17 {\n",
       "  font-weight: bold;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_5378c\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_5378c_level0_col0\" class=\"col_heading level0 col0\" colspan=\"6\">None_None</th>\n",
       "      <th id=\"T_5378c_level0_col6\" class=\"col_heading level0 col6\" colspan=\"6\">None_kojima-01</th>\n",
       "      <th id=\"T_5378c_level0_col12\" class=\"col_heading level0 col12\" >None_kojima-03</th>\n",
       "      <th id=\"T_5378c_level0_col13\" class=\"col_heading level0 col13\" >None_kojima-09</th>\n",
       "      <th id=\"T_5378c_level0_col14\" class=\"col_heading level0 col14\" colspan=\"6\">None_zhou-01</th>\n",
       "      <th id=\"T_5378c_level0_col20\" class=\"col_heading level0 col20\" >qa-01_None</th>\n",
       "      <th id=\"T_5378c_level0_col21\" class=\"col_heading level0 col21\" >qa-05_None</th>\n",
       "      <th id=\"T_5378c_level0_col22\" class=\"col_heading level0 col22\" >qa-08_None</th>\n",
       "      <th id=\"T_5378c_level0_col23\" class=\"col_heading level0 col23\" >qa-09_None</th>\n",
       "      <th id=\"T_5378c_level0_col24\" class=\"col_heading level0 col24\" colspan=\"6\">qa-10_None</th>\n",
       "      <th id=\"T_5378c_level0_col30\" class=\"col_heading level0 col30\" colspan=\"6\">qa-12_None</th>\n",
       "      <th id=\"T_5378c_level0_col36\" class=\"col_heading level0 col36\" colspan=\"6\">qa-13_None</th>\n",
       "      <th id=\"T_5378c_level0_col42\" class=\"col_heading level0 col42\" colspan=\"6\">qa-16_None</th>\n",
       "      <th id=\"T_5378c_level0_col48\" class=\"col_heading level0 col48\" colspan=\"6\">qa-17_None</th>\n",
       "      <th id=\"T_5378c_level0_col54\" class=\"col_heading level0 col54\" colspan=\"6\">refl-01_None</th>\n",
       "      <th id=\"T_5378c_level0_col60\" class=\"col_heading level0 col60\" colspan=\"6\">zhou-01-ins_None</th>\n",
       "      <th id=\"T_5378c_level0_col66\" class=\"col_heading level0 col66\" >zhou-01-ins_zhou-01</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"blank level1\" >&nbsp;</th>\n",
       "      <th id=\"T_5378c_level1_col0\" class=\"col_heading level1 col0\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col1\" class=\"col_heading level1 col1\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col2\" class=\"col_heading level1 col2\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col3\" class=\"col_heading level1 col3\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col4\" class=\"col_heading level1 col4\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col5\" class=\"col_heading level1 col5\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col6\" class=\"col_heading level1 col6\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col7\" class=\"col_heading level1 col7\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col8\" class=\"col_heading level1 col8\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col9\" class=\"col_heading level1 col9\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col10\" class=\"col_heading level1 col10\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col11\" class=\"col_heading level1 col11\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col12\" class=\"col_heading level1 col12\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col13\" class=\"col_heading level1 col13\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col14\" class=\"col_heading level1 col14\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col15\" class=\"col_heading level1 col15\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col16\" class=\"col_heading level1 col16\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col17\" class=\"col_heading level1 col17\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col18\" class=\"col_heading level1 col18\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col19\" class=\"col_heading level1 col19\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col20\" class=\"col_heading level1 col20\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col21\" class=\"col_heading level1 col21\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col22\" class=\"col_heading level1 col22\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col23\" class=\"col_heading level1 col23\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col24\" class=\"col_heading level1 col24\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col25\" class=\"col_heading level1 col25\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col26\" class=\"col_heading level1 col26\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col27\" class=\"col_heading level1 col27\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col28\" class=\"col_heading level1 col28\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col29\" class=\"col_heading level1 col29\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col30\" class=\"col_heading level1 col30\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col31\" class=\"col_heading level1 col31\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col32\" class=\"col_heading level1 col32\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col33\" class=\"col_heading level1 col33\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col34\" class=\"col_heading level1 col34\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col35\" class=\"col_heading level1 col35\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col36\" class=\"col_heading level1 col36\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col37\" class=\"col_heading level1 col37\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col38\" class=\"col_heading level1 col38\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col39\" class=\"col_heading level1 col39\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col40\" class=\"col_heading level1 col40\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col41\" class=\"col_heading level1 col41\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col42\" class=\"col_heading level1 col42\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col43\" class=\"col_heading level1 col43\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col44\" class=\"col_heading level1 col44\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col45\" class=\"col_heading level1 col45\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col46\" class=\"col_heading level1 col46\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col47\" class=\"col_heading level1 col47\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col48\" class=\"col_heading level1 col48\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col49\" class=\"col_heading level1 col49\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col50\" class=\"col_heading level1 col50\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col51\" class=\"col_heading level1 col51\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col52\" class=\"col_heading level1 col52\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col53\" class=\"col_heading level1 col53\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col54\" class=\"col_heading level1 col54\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col55\" class=\"col_heading level1 col55\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col56\" class=\"col_heading level1 col56\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col57\" class=\"col_heading level1 col57\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col58\" class=\"col_heading level1 col58\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col59\" class=\"col_heading level1 col59\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col60\" class=\"col_heading level1 col60\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_5378c_level1_col61\" class=\"col_heading level1 col61\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_5378c_level1_col62\" class=\"col_heading level1 col62\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_5378c_level1_col63\" class=\"col_heading level1 col63\" >gpt-4</th>\n",
       "      <th id=\"T_5378c_level1_col64\" class=\"col_heading level1 col64\" >text-davinci-002</th>\n",
       "      <th id=\"T_5378c_level1_col65\" class=\"col_heading level1 col65\" >text-davinci-003</th>\n",
       "      <th id=\"T_5378c_level1_col66\" class=\"col_heading level1 col66\" >gpt-3.5-turbo</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_5378c_level0_row0\" class=\"row_heading level0 row0\" >strategy_qa</th>\n",
       "      <td id=\"T_5378c_row0_col0\" class=\"data row0 col0\" >0.55</td>\n",
       "      <td id=\"T_5378c_row0_col1\" class=\"data row0 col1\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col2\" class=\"data row0 col2\" >0.55</td>\n",
       "      <td id=\"T_5378c_row0_col3\" class=\"data row0 col3\" >0.73</td>\n",
       "      <td id=\"T_5378c_row0_col4\" class=\"data row0 col4\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col5\" class=\"data row0 col5\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col6\" class=\"data row0 col6\" >0.58</td>\n",
       "      <td id=\"T_5378c_row0_col7\" class=\"data row0 col7\" >0.70</td>\n",
       "      <td id=\"T_5378c_row0_col8\" class=\"data row0 col8\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col9\" class=\"data row0 col9\" >0.85</td>\n",
       "      <td id=\"T_5378c_row0_col10\" class=\"data row0 col10\" >0.62</td>\n",
       "      <td id=\"T_5378c_row0_col11\" class=\"data row0 col11\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col12\" class=\"data row0 col12\" >0.55</td>\n",
       "      <td id=\"T_5378c_row0_col13\" class=\"data row0 col13\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col14\" class=\"data row0 col14\" >0.48</td>\n",
       "      <td id=\"T_5378c_row0_col15\" class=\"data row0 col15\" >0.52</td>\n",
       "      <td id=\"T_5378c_row0_col16\" class=\"data row0 col16\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col17\" class=\"data row0 col17\" >0.91</td>\n",
       "      <td id=\"T_5378c_row0_col18\" class=\"data row0 col18\" >0.73</td>\n",
       "      <td id=\"T_5378c_row0_col19\" class=\"data row0 col19\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col20\" class=\"data row0 col20\" >0.55</td>\n",
       "      <td id=\"T_5378c_row0_col21\" class=\"data row0 col21\" >0.45</td>\n",
       "      <td id=\"T_5378c_row0_col22\" class=\"data row0 col22\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col23\" class=\"data row0 col23\" >0.70</td>\n",
       "      <td id=\"T_5378c_row0_col24\" class=\"data row0 col24\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col25\" class=\"data row0 col25\" >0.58</td>\n",
       "      <td id=\"T_5378c_row0_col26\" class=\"data row0 col26\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col27\" class=\"data row0 col27\" >0.85</td>\n",
       "      <td id=\"T_5378c_row0_col28\" class=\"data row0 col28\" >0.39</td>\n",
       "      <td id=\"T_5378c_row0_col29\" class=\"data row0 col29\" >0.58</td>\n",
       "      <td id=\"T_5378c_row0_col30\" class=\"data row0 col30\" >0.52</td>\n",
       "      <td id=\"T_5378c_row0_col31\" class=\"data row0 col31\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col32\" class=\"data row0 col32\" >0.52</td>\n",
       "      <td id=\"T_5378c_row0_col33\" class=\"data row0 col33\" >0.76</td>\n",
       "      <td id=\"T_5378c_row0_col34\" class=\"data row0 col34\" >0.42</td>\n",
       "      <td id=\"T_5378c_row0_col35\" class=\"data row0 col35\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col36\" class=\"data row0 col36\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col37\" class=\"data row0 col37\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col38\" class=\"data row0 col38\" >0.70</td>\n",
       "      <td id=\"T_5378c_row0_col39\" class=\"data row0 col39\" >0.85</td>\n",
       "      <td id=\"T_5378c_row0_col40\" class=\"data row0 col40\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col41\" class=\"data row0 col41\" >0.70</td>\n",
       "      <td id=\"T_5378c_row0_col42\" class=\"data row0 col42\" >0.52</td>\n",
       "      <td id=\"T_5378c_row0_col43\" class=\"data row0 col43\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col44\" class=\"data row0 col44\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col45\" class=\"data row0 col45\" >0.85</td>\n",
       "      <td id=\"T_5378c_row0_col46\" class=\"data row0 col46\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col47\" class=\"data row0 col47\" >0.55</td>\n",
       "      <td id=\"T_5378c_row0_col48\" class=\"data row0 col48\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col49\" class=\"data row0 col49\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col50\" class=\"data row0 col50\" >0.67</td>\n",
       "      <td id=\"T_5378c_row0_col51\" class=\"data row0 col51\" >0.88</td>\n",
       "      <td id=\"T_5378c_row0_col52\" class=\"data row0 col52\" >0.52</td>\n",
       "      <td id=\"T_5378c_row0_col53\" class=\"data row0 col53\" >0.58</td>\n",
       "      <td id=\"T_5378c_row0_col54\" class=\"data row0 col54\" >0.48</td>\n",
       "      <td id=\"T_5378c_row0_col55\" class=\"data row0 col55\" >0.52</td>\n",
       "      <td id=\"T_5378c_row0_col56\" class=\"data row0 col56\" >0.52</td>\n",
       "      <td id=\"T_5378c_row0_col57\" class=\"data row0 col57\" >0.70</td>\n",
       "      <td id=\"T_5378c_row0_col58\" class=\"data row0 col58\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col59\" class=\"data row0 col59\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col60\" class=\"data row0 col60\" >0.58</td>\n",
       "      <td id=\"T_5378c_row0_col61\" class=\"data row0 col61\" >0.61</td>\n",
       "      <td id=\"T_5378c_row0_col62\" class=\"data row0 col62\" >0.58</td>\n",
       "      <td id=\"T_5378c_row0_col63\" class=\"data row0 col63\" >0.79</td>\n",
       "      <td id=\"T_5378c_row0_col64\" class=\"data row0 col64\" >0.48</td>\n",
       "      <td id=\"T_5378c_row0_col65\" class=\"data row0 col65\" >0.64</td>\n",
       "      <td id=\"T_5378c_row0_col66\" class=\"data row0 col66\" >0.58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_5378c_level0_row1\" class=\"row_heading level0 row1\" >Average</th>\n",
       "      <td id=\"T_5378c_row1_col0\" class=\"data row1 col0\" >0.55</td>\n",
       "      <td id=\"T_5378c_row1_col1\" class=\"data row1 col1\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col2\" class=\"data row1 col2\" >0.55</td>\n",
       "      <td id=\"T_5378c_row1_col3\" class=\"data row1 col3\" >0.73</td>\n",
       "      <td id=\"T_5378c_row1_col4\" class=\"data row1 col4\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col5\" class=\"data row1 col5\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col6\" class=\"data row1 col6\" >0.58</td>\n",
       "      <td id=\"T_5378c_row1_col7\" class=\"data row1 col7\" >0.70</td>\n",
       "      <td id=\"T_5378c_row1_col8\" class=\"data row1 col8\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col9\" class=\"data row1 col9\" >0.85</td>\n",
       "      <td id=\"T_5378c_row1_col10\" class=\"data row1 col10\" >0.62</td>\n",
       "      <td id=\"T_5378c_row1_col11\" class=\"data row1 col11\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col12\" class=\"data row1 col12\" >0.55</td>\n",
       "      <td id=\"T_5378c_row1_col13\" class=\"data row1 col13\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col14\" class=\"data row1 col14\" >0.48</td>\n",
       "      <td id=\"T_5378c_row1_col15\" class=\"data row1 col15\" >0.52</td>\n",
       "      <td id=\"T_5378c_row1_col16\" class=\"data row1 col16\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col17\" class=\"data row1 col17\" >0.91</td>\n",
       "      <td id=\"T_5378c_row1_col18\" class=\"data row1 col18\" >0.73</td>\n",
       "      <td id=\"T_5378c_row1_col19\" class=\"data row1 col19\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col20\" class=\"data row1 col20\" >0.55</td>\n",
       "      <td id=\"T_5378c_row1_col21\" class=\"data row1 col21\" >0.45</td>\n",
       "      <td id=\"T_5378c_row1_col22\" class=\"data row1 col22\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col23\" class=\"data row1 col23\" >0.70</td>\n",
       "      <td id=\"T_5378c_row1_col24\" class=\"data row1 col24\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col25\" class=\"data row1 col25\" >0.58</td>\n",
       "      <td id=\"T_5378c_row1_col26\" class=\"data row1 col26\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col27\" class=\"data row1 col27\" >0.85</td>\n",
       "      <td id=\"T_5378c_row1_col28\" class=\"data row1 col28\" >0.39</td>\n",
       "      <td id=\"T_5378c_row1_col29\" class=\"data row1 col29\" >0.58</td>\n",
       "      <td id=\"T_5378c_row1_col30\" class=\"data row1 col30\" >0.52</td>\n",
       "      <td id=\"T_5378c_row1_col31\" class=\"data row1 col31\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col32\" class=\"data row1 col32\" >0.52</td>\n",
       "      <td id=\"T_5378c_row1_col33\" class=\"data row1 col33\" >0.76</td>\n",
       "      <td id=\"T_5378c_row1_col34\" class=\"data row1 col34\" >0.42</td>\n",
       "      <td id=\"T_5378c_row1_col35\" class=\"data row1 col35\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col36\" class=\"data row1 col36\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col37\" class=\"data row1 col37\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col38\" class=\"data row1 col38\" >0.70</td>\n",
       "      <td id=\"T_5378c_row1_col39\" class=\"data row1 col39\" >0.85</td>\n",
       "      <td id=\"T_5378c_row1_col40\" class=\"data row1 col40\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col41\" class=\"data row1 col41\" >0.70</td>\n",
       "      <td id=\"T_5378c_row1_col42\" class=\"data row1 col42\" >0.52</td>\n",
       "      <td id=\"T_5378c_row1_col43\" class=\"data row1 col43\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col44\" class=\"data row1 col44\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col45\" class=\"data row1 col45\" >0.85</td>\n",
       "      <td id=\"T_5378c_row1_col46\" class=\"data row1 col46\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col47\" class=\"data row1 col47\" >0.55</td>\n",
       "      <td id=\"T_5378c_row1_col48\" class=\"data row1 col48\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col49\" class=\"data row1 col49\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col50\" class=\"data row1 col50\" >0.67</td>\n",
       "      <td id=\"T_5378c_row1_col51\" class=\"data row1 col51\" >0.88</td>\n",
       "      <td id=\"T_5378c_row1_col52\" class=\"data row1 col52\" >0.52</td>\n",
       "      <td id=\"T_5378c_row1_col53\" class=\"data row1 col53\" >0.58</td>\n",
       "      <td id=\"T_5378c_row1_col54\" class=\"data row1 col54\" >0.48</td>\n",
       "      <td id=\"T_5378c_row1_col55\" class=\"data row1 col55\" >0.52</td>\n",
       "      <td id=\"T_5378c_row1_col56\" class=\"data row1 col56\" >0.52</td>\n",
       "      <td id=\"T_5378c_row1_col57\" class=\"data row1 col57\" >0.70</td>\n",
       "      <td id=\"T_5378c_row1_col58\" class=\"data row1 col58\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col59\" class=\"data row1 col59\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col60\" class=\"data row1 col60\" >0.58</td>\n",
       "      <td id=\"T_5378c_row1_col61\" class=\"data row1 col61\" >0.61</td>\n",
       "      <td id=\"T_5378c_row1_col62\" class=\"data row1 col62\" >0.58</td>\n",
       "      <td id=\"T_5378c_row1_col63\" class=\"data row1 col63\" >0.79</td>\n",
       "      <td id=\"T_5378c_row1_col64\" class=\"data row1 col64\" >0.48</td>\n",
       "      <td id=\"T_5378c_row1_col65\" class=\"data row1 col65\" >0.64</td>\n",
       "      <td id=\"T_5378c_row1_col66\" class=\"data row1 col66\" >0.58</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f38ddad7b20>"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eval = coll.evaluate(overwrite=True)\n",
    "table = evaluation_as_table(eval)\n",
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/kon/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "35930fc7ad8441069b879f8780138e4b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kon/work/ThoughtSource/libs/cot/cot/stats.py:406: PerformanceWarning: indexing past lexsort depth may impact performance.\n",
      "  df.loc[dataset, (instruction + \"_\" + cot_trigger, model)] = v\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_306d7_row0_col17, #T_306d7_row1_col17 {\n",
       "  font-weight: bold;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_306d7\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_306d7_level0_col0\" class=\"col_heading level0 col0\" colspan=\"6\">None_None</th>\n",
       "      <th id=\"T_306d7_level0_col6\" class=\"col_heading level0 col6\" colspan=\"6\">None_kojima-01</th>\n",
       "      <th id=\"T_306d7_level0_col12\" class=\"col_heading level0 col12\" >None_kojima-03</th>\n",
       "      <th id=\"T_306d7_level0_col13\" class=\"col_heading level0 col13\" >None_kojima-09</th>\n",
       "      <th id=\"T_306d7_level0_col14\" class=\"col_heading level0 col14\" colspan=\"6\">None_zhou-01</th>\n",
       "      <th id=\"T_306d7_level0_col20\" class=\"col_heading level0 col20\" >qa-01_None</th>\n",
       "      <th id=\"T_306d7_level0_col21\" class=\"col_heading level0 col21\" >qa-05_None</th>\n",
       "      <th id=\"T_306d7_level0_col22\" class=\"col_heading level0 col22\" >qa-08_None</th>\n",
       "      <th id=\"T_306d7_level0_col23\" class=\"col_heading level0 col23\" >qa-09_None</th>\n",
       "      <th id=\"T_306d7_level0_col24\" class=\"col_heading level0 col24\" colspan=\"6\">qa-10_None</th>\n",
       "      <th id=\"T_306d7_level0_col30\" class=\"col_heading level0 col30\" colspan=\"6\">qa-12_None</th>\n",
       "      <th id=\"T_306d7_level0_col36\" class=\"col_heading level0 col36\" colspan=\"6\">qa-13_None</th>\n",
       "      <th id=\"T_306d7_level0_col42\" class=\"col_heading level0 col42\" colspan=\"6\">qa-16_None</th>\n",
       "      <th id=\"T_306d7_level0_col48\" class=\"col_heading level0 col48\" colspan=\"6\">qa-17_None</th>\n",
       "      <th id=\"T_306d7_level0_col54\" class=\"col_heading level0 col54\" colspan=\"6\">refl-01_None</th>\n",
       "      <th id=\"T_306d7_level0_col60\" class=\"col_heading level0 col60\" colspan=\"6\">zhou-01-ins_None</th>\n",
       "      <th id=\"T_306d7_level0_col66\" class=\"col_heading level0 col66\" >zhou-01-ins_zhou-01</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"blank level1\" >&nbsp;</th>\n",
       "      <th id=\"T_306d7_level1_col0\" class=\"col_heading level1 col0\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col1\" class=\"col_heading level1 col1\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col2\" class=\"col_heading level1 col2\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col3\" class=\"col_heading level1 col3\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col4\" class=\"col_heading level1 col4\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col5\" class=\"col_heading level1 col5\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col6\" class=\"col_heading level1 col6\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col7\" class=\"col_heading level1 col7\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col8\" class=\"col_heading level1 col8\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col9\" class=\"col_heading level1 col9\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col10\" class=\"col_heading level1 col10\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col11\" class=\"col_heading level1 col11\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col12\" class=\"col_heading level1 col12\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col13\" class=\"col_heading level1 col13\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col14\" class=\"col_heading level1 col14\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col15\" class=\"col_heading level1 col15\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col16\" class=\"col_heading level1 col16\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col17\" class=\"col_heading level1 col17\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col18\" class=\"col_heading level1 col18\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col19\" class=\"col_heading level1 col19\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col20\" class=\"col_heading level1 col20\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col21\" class=\"col_heading level1 col21\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col22\" class=\"col_heading level1 col22\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col23\" class=\"col_heading level1 col23\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col24\" class=\"col_heading level1 col24\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col25\" class=\"col_heading level1 col25\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col26\" class=\"col_heading level1 col26\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col27\" class=\"col_heading level1 col27\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col28\" class=\"col_heading level1 col28\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col29\" class=\"col_heading level1 col29\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col30\" class=\"col_heading level1 col30\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col31\" class=\"col_heading level1 col31\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col32\" class=\"col_heading level1 col32\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col33\" class=\"col_heading level1 col33\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col34\" class=\"col_heading level1 col34\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col35\" class=\"col_heading level1 col35\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col36\" class=\"col_heading level1 col36\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col37\" class=\"col_heading level1 col37\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col38\" class=\"col_heading level1 col38\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col39\" class=\"col_heading level1 col39\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col40\" class=\"col_heading level1 col40\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col41\" class=\"col_heading level1 col41\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col42\" class=\"col_heading level1 col42\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col43\" class=\"col_heading level1 col43\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col44\" class=\"col_heading level1 col44\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col45\" class=\"col_heading level1 col45\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col46\" class=\"col_heading level1 col46\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col47\" class=\"col_heading level1 col47\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col48\" class=\"col_heading level1 col48\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col49\" class=\"col_heading level1 col49\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col50\" class=\"col_heading level1 col50\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col51\" class=\"col_heading level1 col51\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col52\" class=\"col_heading level1 col52\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col53\" class=\"col_heading level1 col53\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col54\" class=\"col_heading level1 col54\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col55\" class=\"col_heading level1 col55\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col56\" class=\"col_heading level1 col56\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col57\" class=\"col_heading level1 col57\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col58\" class=\"col_heading level1 col58\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col59\" class=\"col_heading level1 col59\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col60\" class=\"col_heading level1 col60\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_306d7_level1_col61\" class=\"col_heading level1 col61\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_306d7_level1_col62\" class=\"col_heading level1 col62\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_306d7_level1_col63\" class=\"col_heading level1 col63\" >gpt-4</th>\n",
       "      <th id=\"T_306d7_level1_col64\" class=\"col_heading level1 col64\" >text-davinci-002</th>\n",
       "      <th id=\"T_306d7_level1_col65\" class=\"col_heading level1 col65\" >text-davinci-003</th>\n",
       "      <th id=\"T_306d7_level1_col66\" class=\"col_heading level1 col66\" >gpt-3.5-turbo</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_306d7_level0_row0\" class=\"row_heading level0 row0\" >strategy_qa</th>\n",
       "      <td id=\"T_306d7_row0_col0\" class=\"data row0 col0\" >0.55</td>\n",
       "      <td id=\"T_306d7_row0_col1\" class=\"data row0 col1\" >0.67</td>\n",
       "      <td id=\"T_306d7_row0_col2\" class=\"data row0 col2\" >0.55</td>\n",
       "      <td id=\"T_306d7_row0_col3\" class=\"data row0 col3\" >0.73</td>\n",
       "      <td id=\"T_306d7_row0_col4\" class=\"data row0 col4\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col5\" class=\"data row0 col5\" >0.61</td>\n",
       "      <td id=\"T_306d7_row0_col6\" class=\"data row0 col6\" >0.58</td>\n",
       "      <td id=\"T_306d7_row0_col7\" class=\"data row0 col7\" >0.70</td>\n",
       "      <td id=\"T_306d7_row0_col8\" class=\"data row0 col8\" >0.61</td>\n",
       "      <td id=\"T_306d7_row0_col9\" class=\"data row0 col9\" >0.85</td>\n",
       "      <td id=\"T_306d7_row0_col10\" class=\"data row0 col10\" >0.52</td>\n",
       "      <td id=\"T_306d7_row0_col11\" class=\"data row0 col11\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col12\" class=\"data row0 col12\" >0.55</td>\n",
       "      <td id=\"T_306d7_row0_col13\" class=\"data row0 col13\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col14\" class=\"data row0 col14\" >0.24</td>\n",
       "      <td id=\"T_306d7_row0_col15\" class=\"data row0 col15\" >0.48</td>\n",
       "      <td id=\"T_306d7_row0_col16\" class=\"data row0 col16\" >0.61</td>\n",
       "      <td id=\"T_306d7_row0_col17\" class=\"data row0 col17\" >0.91</td>\n",
       "      <td id=\"T_306d7_row0_col18\" class=\"data row0 col18\" >0.70</td>\n",
       "      <td id=\"T_306d7_row0_col19\" class=\"data row0 col19\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col20\" class=\"data row0 col20\" >0.55</td>\n",
       "      <td id=\"T_306d7_row0_col21\" class=\"data row0 col21\" >0.45</td>\n",
       "      <td id=\"T_306d7_row0_col22\" class=\"data row0 col22\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col23\" class=\"data row0 col23\" >0.70</td>\n",
       "      <td id=\"T_306d7_row0_col24\" class=\"data row0 col24\" >0.12</td>\n",
       "      <td id=\"T_306d7_row0_col25\" class=\"data row0 col25\" >0.58</td>\n",
       "      <td id=\"T_306d7_row0_col26\" class=\"data row0 col26\" >0.61</td>\n",
       "      <td id=\"T_306d7_row0_col27\" class=\"data row0 col27\" >0.85</td>\n",
       "      <td id=\"T_306d7_row0_col28\" class=\"data row0 col28\" >0.39</td>\n",
       "      <td id=\"T_306d7_row0_col29\" class=\"data row0 col29\" >0.58</td>\n",
       "      <td id=\"T_306d7_row0_col30\" class=\"data row0 col30\" >0.09</td>\n",
       "      <td id=\"T_306d7_row0_col31\" class=\"data row0 col31\" >0.67</td>\n",
       "      <td id=\"T_306d7_row0_col32\" class=\"data row0 col32\" >0.52</td>\n",
       "      <td id=\"T_306d7_row0_col33\" class=\"data row0 col33\" >0.76</td>\n",
       "      <td id=\"T_306d7_row0_col34\" class=\"data row0 col34\" >0.42</td>\n",
       "      <td id=\"T_306d7_row0_col35\" class=\"data row0 col35\" >0.67</td>\n",
       "      <td id=\"T_306d7_row0_col36\" class=\"data row0 col36\" >0.12</td>\n",
       "      <td id=\"T_306d7_row0_col37\" class=\"data row0 col37\" >0.67</td>\n",
       "      <td id=\"T_306d7_row0_col38\" class=\"data row0 col38\" >0.70</td>\n",
       "      <td id=\"T_306d7_row0_col39\" class=\"data row0 col39\" >0.85</td>\n",
       "      <td id=\"T_306d7_row0_col40\" class=\"data row0 col40\" >0.61</td>\n",
       "      <td id=\"T_306d7_row0_col41\" class=\"data row0 col41\" >0.70</td>\n",
       "      <td id=\"T_306d7_row0_col42\" class=\"data row0 col42\" >0.33</td>\n",
       "      <td id=\"T_306d7_row0_col43\" class=\"data row0 col43\" >0.67</td>\n",
       "      <td id=\"T_306d7_row0_col44\" class=\"data row0 col44\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col45\" class=\"data row0 col45\" >0.85</td>\n",
       "      <td id=\"T_306d7_row0_col46\" class=\"data row0 col46\" >0.61</td>\n",
       "      <td id=\"T_306d7_row0_col47\" class=\"data row0 col47\" >0.55</td>\n",
       "      <td id=\"T_306d7_row0_col48\" class=\"data row0 col48\" >0.30</td>\n",
       "      <td id=\"T_306d7_row0_col49\" class=\"data row0 col49\" >0.67</td>\n",
       "      <td id=\"T_306d7_row0_col50\" class=\"data row0 col50\" >0.67</td>\n",
       "      <td id=\"T_306d7_row0_col51\" class=\"data row0 col51\" >0.88</td>\n",
       "      <td id=\"T_306d7_row0_col52\" class=\"data row0 col52\" >0.45</td>\n",
       "      <td id=\"T_306d7_row0_col53\" class=\"data row0 col53\" >0.58</td>\n",
       "      <td id=\"T_306d7_row0_col54\" class=\"data row0 col54\" >0.06</td>\n",
       "      <td id=\"T_306d7_row0_col55\" class=\"data row0 col55\" >0.48</td>\n",
       "      <td id=\"T_306d7_row0_col56\" class=\"data row0 col56\" >0.52</td>\n",
       "      <td id=\"T_306d7_row0_col57\" class=\"data row0 col57\" >0.70</td>\n",
       "      <td id=\"T_306d7_row0_col58\" class=\"data row0 col58\" >0.55</td>\n",
       "      <td id=\"T_306d7_row0_col59\" class=\"data row0 col59\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col60\" class=\"data row0 col60\" >0.09</td>\n",
       "      <td id=\"T_306d7_row0_col61\" class=\"data row0 col61\" >0.61</td>\n",
       "      <td id=\"T_306d7_row0_col62\" class=\"data row0 col62\" >0.58</td>\n",
       "      <td id=\"T_306d7_row0_col63\" class=\"data row0 col63\" >0.79</td>\n",
       "      <td id=\"T_306d7_row0_col64\" class=\"data row0 col64\" >0.48</td>\n",
       "      <td id=\"T_306d7_row0_col65\" class=\"data row0 col65\" >0.64</td>\n",
       "      <td id=\"T_306d7_row0_col66\" class=\"data row0 col66\" >0.58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_306d7_level0_row1\" class=\"row_heading level0 row1\" >Average</th>\n",
       "      <td id=\"T_306d7_row1_col0\" class=\"data row1 col0\" >0.55</td>\n",
       "      <td id=\"T_306d7_row1_col1\" class=\"data row1 col1\" >0.67</td>\n",
       "      <td id=\"T_306d7_row1_col2\" class=\"data row1 col2\" >0.55</td>\n",
       "      <td id=\"T_306d7_row1_col3\" class=\"data row1 col3\" >0.73</td>\n",
       "      <td id=\"T_306d7_row1_col4\" class=\"data row1 col4\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col5\" class=\"data row1 col5\" >0.61</td>\n",
       "      <td id=\"T_306d7_row1_col6\" class=\"data row1 col6\" >0.58</td>\n",
       "      <td id=\"T_306d7_row1_col7\" class=\"data row1 col7\" >0.70</td>\n",
       "      <td id=\"T_306d7_row1_col8\" class=\"data row1 col8\" >0.61</td>\n",
       "      <td id=\"T_306d7_row1_col9\" class=\"data row1 col9\" >0.85</td>\n",
       "      <td id=\"T_306d7_row1_col10\" class=\"data row1 col10\" >0.52</td>\n",
       "      <td id=\"T_306d7_row1_col11\" class=\"data row1 col11\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col12\" class=\"data row1 col12\" >0.55</td>\n",
       "      <td id=\"T_306d7_row1_col13\" class=\"data row1 col13\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col14\" class=\"data row1 col14\" >0.24</td>\n",
       "      <td id=\"T_306d7_row1_col15\" class=\"data row1 col15\" >0.48</td>\n",
       "      <td id=\"T_306d7_row1_col16\" class=\"data row1 col16\" >0.61</td>\n",
       "      <td id=\"T_306d7_row1_col17\" class=\"data row1 col17\" >0.91</td>\n",
       "      <td id=\"T_306d7_row1_col18\" class=\"data row1 col18\" >0.70</td>\n",
       "      <td id=\"T_306d7_row1_col19\" class=\"data row1 col19\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col20\" class=\"data row1 col20\" >0.55</td>\n",
       "      <td id=\"T_306d7_row1_col21\" class=\"data row1 col21\" >0.45</td>\n",
       "      <td id=\"T_306d7_row1_col22\" class=\"data row1 col22\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col23\" class=\"data row1 col23\" >0.70</td>\n",
       "      <td id=\"T_306d7_row1_col24\" class=\"data row1 col24\" >0.12</td>\n",
       "      <td id=\"T_306d7_row1_col25\" class=\"data row1 col25\" >0.58</td>\n",
       "      <td id=\"T_306d7_row1_col26\" class=\"data row1 col26\" >0.61</td>\n",
       "      <td id=\"T_306d7_row1_col27\" class=\"data row1 col27\" >0.85</td>\n",
       "      <td id=\"T_306d7_row1_col28\" class=\"data row1 col28\" >0.39</td>\n",
       "      <td id=\"T_306d7_row1_col29\" class=\"data row1 col29\" >0.58</td>\n",
       "      <td id=\"T_306d7_row1_col30\" class=\"data row1 col30\" >0.09</td>\n",
       "      <td id=\"T_306d7_row1_col31\" class=\"data row1 col31\" >0.67</td>\n",
       "      <td id=\"T_306d7_row1_col32\" class=\"data row1 col32\" >0.52</td>\n",
       "      <td id=\"T_306d7_row1_col33\" class=\"data row1 col33\" >0.76</td>\n",
       "      <td id=\"T_306d7_row1_col34\" class=\"data row1 col34\" >0.42</td>\n",
       "      <td id=\"T_306d7_row1_col35\" class=\"data row1 col35\" >0.67</td>\n",
       "      <td id=\"T_306d7_row1_col36\" class=\"data row1 col36\" >0.12</td>\n",
       "      <td id=\"T_306d7_row1_col37\" class=\"data row1 col37\" >0.67</td>\n",
       "      <td id=\"T_306d7_row1_col38\" class=\"data row1 col38\" >0.70</td>\n",
       "      <td id=\"T_306d7_row1_col39\" class=\"data row1 col39\" >0.85</td>\n",
       "      <td id=\"T_306d7_row1_col40\" class=\"data row1 col40\" >0.61</td>\n",
       "      <td id=\"T_306d7_row1_col41\" class=\"data row1 col41\" >0.70</td>\n",
       "      <td id=\"T_306d7_row1_col42\" class=\"data row1 col42\" >0.33</td>\n",
       "      <td id=\"T_306d7_row1_col43\" class=\"data row1 col43\" >0.67</td>\n",
       "      <td id=\"T_306d7_row1_col44\" class=\"data row1 col44\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col45\" class=\"data row1 col45\" >0.85</td>\n",
       "      <td id=\"T_306d7_row1_col46\" class=\"data row1 col46\" >0.61</td>\n",
       "      <td id=\"T_306d7_row1_col47\" class=\"data row1 col47\" >0.55</td>\n",
       "      <td id=\"T_306d7_row1_col48\" class=\"data row1 col48\" >0.30</td>\n",
       "      <td id=\"T_306d7_row1_col49\" class=\"data row1 col49\" >0.67</td>\n",
       "      <td id=\"T_306d7_row1_col50\" class=\"data row1 col50\" >0.67</td>\n",
       "      <td id=\"T_306d7_row1_col51\" class=\"data row1 col51\" >0.88</td>\n",
       "      <td id=\"T_306d7_row1_col52\" class=\"data row1 col52\" >0.45</td>\n",
       "      <td id=\"T_306d7_row1_col53\" class=\"data row1 col53\" >0.58</td>\n",
       "      <td id=\"T_306d7_row1_col54\" class=\"data row1 col54\" >0.06</td>\n",
       "      <td id=\"T_306d7_row1_col55\" class=\"data row1 col55\" >0.48</td>\n",
       "      <td id=\"T_306d7_row1_col56\" class=\"data row1 col56\" >0.52</td>\n",
       "      <td id=\"T_306d7_row1_col57\" class=\"data row1 col57\" >0.70</td>\n",
       "      <td id=\"T_306d7_row1_col58\" class=\"data row1 col58\" >0.55</td>\n",
       "      <td id=\"T_306d7_row1_col59\" class=\"data row1 col59\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col60\" class=\"data row1 col60\" >0.09</td>\n",
       "      <td id=\"T_306d7_row1_col61\" class=\"data row1 col61\" >0.61</td>\n",
       "      <td id=\"T_306d7_row1_col62\" class=\"data row1 col62\" >0.58</td>\n",
       "      <td id=\"T_306d7_row1_col63\" class=\"data row1 col63\" >0.79</td>\n",
       "      <td id=\"T_306d7_row1_col64\" class=\"data row1 col64\" >0.48</td>\n",
       "      <td id=\"T_306d7_row1_col65\" class=\"data row1 col65\" >0.64</td>\n",
       "      <td id=\"T_306d7_row1_col66\" class=\"data row1 col66\" >0.58</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f37e55b51e0>"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coll = Collection.load_thoughtsource_33()\n",
    "# reverse=True inverts the selection: the stored result table lists only strategy_qa.\n",
    "coll.unload_datasets(\"strategy_qa\", reverse=True)\n",
    "# Bound to `evaluation`, not `eval`, so the builtin eval() stays usable in this kernel.\n",
    "evaluation = coll.evaluate()\n",
    "table = evaluation_as_table(evaluation)\n",
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "054f3367edca4d139065a2f24de12d4f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "coll.dump(\"eval_bool_3\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check accuracy vs. Krippendorff's alpha for GPT-4 on commonsense_qa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll = Collection.from_json(\"/home/kon/work/ThoughtSource/libs/cot/cot/datasets/thoughtsource/thoughtsource_33_paper.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll.unload_datasets(\"commonsense_qa\", reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): presumably narrows `coll` in place to CoTs generated by gpt-4 -- confirm.\n",
    "coll.select_generated_cots(model=\"gpt-4\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): presumably keeps only CoTs whose evaluated answer is False (wrong) -- confirm.\n",
    "coll.select_generated_cots(answer=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "commonsense_qa {0, 1, 2, 4, 5, 6, 9, 10}\n"
     ]
    }
   ],
   "source": [
    "coll.number_generated_cots()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll.dump(\"eval_commonsense_gpt4_false\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Treat unknown answers as null instead of wrong"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll = Collection.from_json(\"/home/kon/work/ThoughtSource/libs/cot/cot/datasets/thoughtsource/thoughtsource_33_paper.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll.evaluate(overwrite=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5ef261cd72a243a29568a3f5c941b79c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "893a2a71cc8d4489b1a40acc3045f773",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "591467b3830d41df9156ad8b097a876d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "49a5665e7c454b82a62fc42ae88fc1d9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "44256700a3c3473eac454f154351e76d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "91652fa6cc7c46b087a5e9c70db95a61",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "coll.dump(\"/home/kon/work/ThoughtSource/libs/cot/cot/datasets/thoughtsource/thoughtsource_33_paper.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "72fa2b8cde8f4b83909117173591f485",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "28a16b45dc1c4fe3a0084c852c40e254",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cbde778559a244d98ff25de9f007c3c0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8e7e4482c86144d1914a50a42408c202",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "768c88df09a6455286b28c70ed32aa1b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "570e2ddace634ebbb07e873af7aace75",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a7cfba7566ec436e85bf0e22a667e0ec",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d1c26099ebd94490868cc85516a8a675",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4a9682b5e5a84bc2bd4be8666b7f666e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bd2b4407712948dc90f55dbd82113822",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a5282be6ec2b4f4fa65a1e74f431290f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f285a46758834e8abe499a86d4593026",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/33 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0bb6fb78e83d49da99c702e326df9d73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cdcbf25988ae45ac97a8c77f23d28954",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "41845d2ec9a64b039ce6ae859c67f67b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "65dce3e2e3864f0796a386e9ae3aee8a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "97af9054b9544fad8b313f8ee06f450f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9cd6542e63394991a3288e971eb3b422",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "63d718378f2f47d7ace3a4683dbdc81f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "59b703ee2cae4d9883e3aefc081df417",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8b5022c50d0f48ccb3fa661d0093d51e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "71b115983ed94764a49bf95b3392b00f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c360017fa19f4ee2832bcdbde823b1b1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7baf95c8e85a4e8a82eb38035f5fbfbd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d91ecc5faf854c518e815c722c48af69",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "642adba06f0f4c7e9b3ad7db38b07760",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "13f7d8bcc2cc43fabdbd16b6b5ef8865",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dbcfa4ecfe7744a4b35c73b25e12fd3c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "58f145a08d2e45ab830a524e4c7e1b71",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "44feccd0a31749cfb3443e68d65b6449",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found difference in collection old/new evaluation overwrite. Files for comparison are created: compare_evaluation_20230426-162942_a_old.json and compare_evaluation_20230426-162942_b_new.json\n"
     ]
    }
   ],
   "source": [
    "compare_evaluation_difference(coll)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
